Center for Skeletal Research Data Analysis Services. Authored by Catherine M. May, Boston Children’s Hospital.
Parts of this script were adapted from the Harvard Chan Bioinformatic Core and Bioconductor. https://hbctraining.github.io/DGE_workshop_salmon_online/schedule/. As well as Lorena Pantano Harvard TH Chan School of Public Health. https://bioconductor.org/packages/release/bioc/vignettes/DEGreport/inst/doc/DEGreport.html.
# Make sure you have Bioconductor installed
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install(version = "3.11")
# BiocManager::install(c("DESeq2","AnnotationDbi","tximport","tximportData","pheatmap","clusterProfiler","DEGreport","GSEABase","DOSE","rhdf5","org.Hs.eg.db","org.Mm.eg.db","EnsDb.Hsapiens.v86","EnsDb.Mmusculus.v79","AnnotationDbi","AnnotationHub","AnnotationFilter","ensembldb","annotables","TxDb.Hsapiens.UCSC.hg19.knownGene","TxDb.Mmusculus.UCSC.mm9.knownGene"))
# You need to load the libraries every session.
library(devtools)
library(BiocManager)
library(tximport)
library(tximportData)
library(ggpubr)
library(ggpubr)
library(ggplot2)
library(ggfortify)
library(RColorBrewer)
library(pheatmap)
library(clusterProfiler)
library(DEGreport)
library(GSEABase)
library(DOSE)
library(rhdf5)
library(stringr)
library(tidyr)
library(pathview)
library(org.Mm.eg.db)
library(EnsDb.Mmusculus.v79)
library(AnnotationDbi)
library(AnnotationHub)
library(AnnotationFilter)
library(ggrepel)
library(ensembldb)
library(cowplot)
library(annotables)
library(clusterProfiler)
library(DESeq2)
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
library(tidyverse) #includes both library(dplyr) library(tibble)
# Remove all environmental variables every time.
# Sometimes variables left over from previous sessions can cause errors.
rm(list=ls())
# Set your working directory. Put all data in this folder.
setwd("~/Desktop/rstudio/CSR/osteocalcin")
# For reproducibility and methods it is recommended you record your R environment.
# methods <- session_info()
# writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
Import the Kallisto counts files, the metadata file and the tx2gene annotation file.
For Grcm38 using the EnsDB. You can build your own custom annotation table using the instrutions at: https://hbctraining.github.io/DGE_workshop_salmon/lessons/AnnotationDbi_lesson.html.
#tx2gene <- read.delim("tx2gene_grch38_ens94.txt") # Human
tx2gene <- read.delim("tx2gene_grcm38_v79_wtransgenes_4Aug20.txt") # Mouse with transgenes added in
Automatic file loading requires you structure your folder hierarchy in a specific way (ie. ~/Desktop/rstudio/FC04020/data/S8.kallisto/abundance.tsv). Folders ending in the .kallisto suffix will be searched, for the abundance.tsv files which will be loaded into the variable ‘files.’
dir <- ("~/Desktop/rstudio/CSR/osteocalcin/")
samples_all <- list.files(path = "./data", full.names = T, pattern="kallisto") # Load all data
samples_ko <- list.files(path = "./data", full.names = T, pattern="KO") # Load KO
samples_wt <- list.files(path = "./data", full.names = T, pattern="WT") # Load WT
samples_uk <- list.files(path = "./data", full.names = T, pattern="UK") # Load UK
# Set up your groups for later DEG contrasts
samples_ko_wt <- append(samples_ko,samples_wt) # Merge two datasets
samples_uk_wt <- append(samples_uk,samples_wt)
samples_tissue <- samples_all
# This gets the file path for each sample
files_all <- file.path(samples_all, "abundance.tsv")
files_ko_wt <- file.path(samples_ko_wt, "abundance.tsv")
files_uk_wt <- file.path(samples_uk_wt, "abundance.tsv")
files_tissue <- file.path(samples_tissue, "abundance.tsv")
# This will strip off the file path info, leaving only the names of the files
# Note: the %>% is a pipe that passes information from one argument to another
names(files_all) <- str_replace(samples_all, "./data/", "") %>% str_replace(".kallisto", "")
names(files_ko_wt) <- str_replace(samples_ko_wt, "./data/", "") %>% str_replace(".kallisto", "")
names(files_uk_wt) <- str_replace(samples_uk_wt, "./data/", "") %>% str_replace(".kallisto", "")
names(files_tissue) <- str_replace(samples_tissue, "./data/", "") %>% str_replace(".kallisto", "")
# This checks that for each sample folder there is an abundance.tsv file
all(file.exists(files_all))
all(file.exists(files_ko_wt))
all(file.exists(files_uk_wt))
all(file.exists(files_tissue))
# If you want to find out what a function does, type
# ?function
# ie ?str_replace
Imports transcript-level abundance, estimate counts and transcript lengths, and generate a matrix
# Ignore transcript version if needed ie. ENSMUSG00000000300.1, ENSMUSG00000000300.2 should not be counted as separate transcripts just because they are different versions.
# add this inside the tximport function if needed: ignoreAfterBar = TRUE, ignoreTxVersion = TRUE, txOut=F
txi_all <- tximport(files_all, type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_ko_wt <- tximport(files_ko_wt, type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_uk_wt <- tximport(files_uk_wt, type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
txi_tissue <- tximport(files_tissue, type = "kallisto", tx2gene=tx2gene, ignoreTxVersion = T)
# Check your imported data for issues
# head(txi_all$counts) # Check the first ten lines
# tail(txi_all$counts) # Check the last ten lines
# dim(txi_all$counts) # Check the dimensions
# colnames(txi_all$counts) # Check the column names
# summary(txi_all$counts) # Get a summary of the data
# class(tx2gene) # Check the class
# Create a dataframe with the counts
data <- txi_all$counts %>% round() %>% data.frame()
# Save the raw data
write.csv(data, file="~/Desktop/rstudio/CSR/osteocalcin/results/Raw_counts.csv", sep="\t", quote=F, col.names=NA)
# Read in metadata from a .csv file
meta <- read.csv("~/Desktop/rstudio/CSR/osteocalcin/osteocalcin_metadata_4Aug20.csv", stringsAsFactors=T)
# Convert from integer to factor to improve plotting
# meta$replicate <- as.factor(meta$replicate)
# meta$library <- as.factor(meta$library)
# meta$tissue <- as.factor(meta$tissue)
meta_all <- meta[c(1:14),]
meta_ko_wt <- meta[c(1:6,11:14),]
meta_uk_wt <- meta[c(7:10,11:14),]
meta_tissue <- meta[c(1:14),]
# # Check to see if a factor is integer or factor
# class(meta$replicate)
# class(meta$library)
Plot the distribution of the data and check for any potential issues.
In case you want to look at averaged data for the replicates. Not recommended for the DEG, as DESeq models will already account for replicates.
# Show(txi_avg$counts) #To find the column numbers
# Average columns manually
avg_wt <- ((txi_all$counts[,2] + txi_all$counts[,6] + txi_all$counts[,7] + txi_all$counts[,8]) / 4)
avg_ko <- ((txi_all$counts[,1] + txi_all$counts[,3] + txi_all$counts[,4] + txi_all$counts[,5] + txi_all$counts[,9] + txi_all$counts[,10]) / 6)
# Step 3: Create a new dataframe with the averaged columns
data_avg <- cbind(avg_wt, avg_ko)
#Show(data_avg)
Check for outlier samples
avg1_plot <- ggplot(data_avg) + geom_histogram(aes(x = avg_wt), stat = "bin", bins = 200) + xlim(0,20000) + ylim(0,2000) +
xlab("Raw expression counts") + ylab("Number of genes") + ggtitle("WT")
avg2_plot <- ggplot(data_avg) + geom_histogram(aes(x = avg_ko), stat = "bin", bins = 200) + xlim(0,20000) + ylim(0,2000) +
xlab("Raw expression counts") + ylab("Number of genes") + ggtitle("KO")
This will help determine what model distribution to use. Most RNA-seq data falls along a Negative Binomial (NB) distribution.
To use a poisson distribution, it assumes that mean equals variance. Plot counts mean vs. variance to check if assumption holds.
# Select data to use and find mean and var
mean_counts_wt <- apply(data[,c(11:14)], 1, mean)
variance_counts_wt <- apply(data[,c(11:14)], 1, var)
mean_counts_ko <- apply(data[,c(1:6)], 1, mean)
variance_counts_ko <- apply(data[,c(1:6)], 1, var)
# Create a new dataframe with mean and variance
df_wt <- data.frame(mean_counts_wt, variance_counts_wt)
df_ko <- data.frame(mean_counts_ko, variance_counts_ko)
The NB distribution assumes that the mean is not equal to variance. If the scatter falls on the red line, then you may have to use a different distribution. Normally the scatter falls to the left of the line (ie low mean counts have higher variance)
wt_plot <- ggplot(df_wt) + geom_point(aes(x=mean_counts_wt,
y=variance_counts_wt)) + ggtitle("WT Counts") +
geom_abline(intercept = 0, slope = 1, color="red") +
scale_y_log10(limits = c(500,1e9)) + scale_x_log10(limits = c(100,1e9))
ko_plot <- ggplot(df_ko) + geom_point(aes(x=mean_counts_ko,
y=variance_counts_ko)) + ggtitle("KO Counts") +
geom_abline(intercept = 0, slope = 1, color="red") +
scale_y_log10(limits = c(500,1e9)) + scale_x_log10(limits = c(100,1e9))
Ensure that the counts labels and metadata match.
meta_names_all <- factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
"UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
"WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_ko_wt <- factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
"WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_uk_wt <- factor(c("UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
"WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
meta_names_tissue <- factor(c("KO_S10_4020","KO_S4_4071","KO_S5_4071","KO_S6_4071","KO_S9_4020", "KO_S9_4071",
"UK_S4_4000","UK_S5_4000","UK_S6_4000","UK_S7_4000",
"WT_S11_4020","WT_S7_4071","WT_S8_4020","WT_S8_4071"))
rownames(meta_all) <- meta_names_all
rownames(meta_ko_wt) <- meta_names_ko_wt
rownames(meta_uk_wt) <- meta_names_uk_wt
rownames(meta_tissue) <- meta_names_tissue
# Does data in x match data in y?
print("Does data in txi counts match data in metadata?")
all(colnames(txi_all$counts) %in% rownames(meta_all))
all(colnames(txi_ko_wt$counts) %in% rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts) %in% rownames(meta_uk_wt))
all(colnames(txi_tissue$counts) %in% rownames(meta_tissue))
# Is data in the same order in x as y?
print("Are txi counts in the same order as in metadata?")
all(colnames(txi_all$counts) == rownames(meta_all))
all(colnames(txi_ko_wt$counts) == rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts) == rownames(meta_uk_wt))
all(colnames(txi_tissue$counts) == rownames(meta_tissue))
Samples in dataframe to match sample name order in metadata
# Check sample name order in both the dataframe and metadata file
colnames(data)
# If they don't match, then use the match function to re-arrange
# First find index numbers
idx_all <- match(rownames(meta_all), colnames(txi_all$counts))
idx_ko_wt <- match(rownames(meta_ko_wt), colnames(txi_ko_wt$counts))
idx_uk_wt <- match(rownames(meta_uk_wt), colnames(txi_uk_wt$counts))
idx_tissue <- match(rownames(meta_tissue), colnames(txi_tissue$counts))
# Use this to re-order
txi_all$counts <- txi_all$counts[, idx_all]
txi_ko_wt$counts <- txi_ko_wt$counts[, idx_ko_wt]
txi_uk_wt$counts <- txi_uk_wt$counts[, idx_uk_wt]
txi_tissue$counts <- txi_tissue$counts[, idx_tissue]
# Check that you re-ordered the data correctly.
# Mismatches will cause errors in the downstream analysis.
print("Do column names match now between txi counts and metadata?")
all(colnames(txi_all$counts) %in% rownames(meta_all))
all(colnames(txi_ko_wt$counts) %in% rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts) %in% rownames(meta_uk_wt))
all(colnames(txi_tissue$counts) %in% rownames(meta_tissue))
print("Are column names now in the same order in txi counts and metadata?")
all(colnames(txi_all$counts) == rownames(meta_all))
all(colnames(txi_ko_wt$counts) == rownames(meta_ko_wt))
all(colnames(txi_uk_wt$counts) == rownames(meta_uk_wt))
all(colnames(txi_tissue$counts) == rownames(meta_tissue))
Most of the analyses will rely on accessing or recording information to the DESeq2 object
# I made a separate object for each comparison
dds_all <- DESeqDataSetFromTximport(txi_all, colData = meta_all, design = ~ library)
dds_ko_wt <- DESeqDataSetFromTximport(txi_ko_wt, colData = meta_ko_wt, design = ~ genotype)
dds_uk_wt <- DESeqDataSetFromTximport(txi_uk_wt, colData = meta_uk_wt, design = ~ genotype)
dds_tissue <- DESeqDataSetFromTximport(txi_tissue, colData = meta_tissue, design = ~ tissue)
# To build a more complex model to include other variables see below or google "design formula"
# The formula below can be interpreted as "counts explained by tissue source + batch + cell type)
# dds_complex <- DESeqDataSetFromTximport(txi, colData=meta, design = ~ tissue + batch + celltype)
# Check to make sure the objects were created correctly
# Show(counts(dds_tissue))
# dds_tissue@colData@rownames
DESeq2 uses Median of Ratios as a normalization method
# Generate size factors for the median of ratios method of normalization
dds_all <- estimateSizeFactors(dds_all)
dds_ko_wt <- estimateSizeFactors(dds_ko_wt)
dds_uk_wt <- estimateSizeFactors(dds_uk_wt)
dds_tissue <- estimateSizeFactors(dds_tissue)
# Fill the slots of the DESeqDataSet object with the appropriate information
sizeFactors(dds_all)
sizeFactors(dds_ko_wt)
sizeFactors(dds_uk_wt)
sizeFactors(dds_tissue)
# Retrieve the normalized counts matrix from dds
norm_counts_all <- counts(dds_all, normalized=TRUE)
norm_counts_ko_wt <- counts(dds_ko_wt, normalized=TRUE)
norm_counts_uk_wt <- counts(dds_uk_wt, normalized=TRUE)
norm_counts_tissue <- counts(dds_tissue, normalized=TRUE)
rlog is the standard
# rlog() function returns a DESeqTransform object, used for the PCA and hierarchical clustering
# blind=true means that the transformation in unbiased, not taking sample into consideration
rld_all <- rlog(dds_all, blind=TRUE)
rld_ko_wt <- rlog(dds_ko_wt, blind=TRUE)
rld_uk_wt <- rlog(dds_uk_wt, blind=TRUE)
rld_tissue <- rlog(dds_tissue, blind=TRUE)
Principle Component Analysis by sample groups. Data performed on normalized r-log transformed data.
# If you use the <- to assign a plot to a variable
# You can "call" the plot by the name to visualize or save it
plot1a <- plotPCA(rld_ko_wt, intgroup="genotype")
plot1a <- plot1a + ggtitle("Genotype (WT vs KO) PCA")
plot1a
rld_tissue$tissue <- as.factor(rld_tissue$tissue)
plot1b2 <- plotPCA(rld_tissue, intgroup="tissue")
plot1b2 <- plot1b2 + ggtitle("Tissue PCA")
plot1b2
rld_all$library <- as.factor(rld_all$library)
plot1b <- plotPCA(rld_all, intgroup="library")
plot1b <- plot1b + ggtitle("Library PCA")
plot1b
ggsave("PCA_library.tiff", plot = plot1b, device = "tiff", scale = 1.5, dpi = 300,
path ="~/Desktop/rstudio/CSR/osteocalcin/results", width = 7, height = 5, units = "in")
rld_all$replicate <- as.factor(rld_all$replicate)
plot1c <- plotPCA(rld_all, intgroup="replicate")
plot1c <- plot1c + ggtitle("Replicate PCA")
plot1c
ggsave("PCA_replicates.tiff", plot = plot1c, device = "tiff", scale = 1.5, dpi = 300,
path ="~/Desktop/rstudio/CSR/osteocalcin/results", width = 7, height = 5, units = "in")
Make sure that there is not a correlation of data with confounding factors, like batch or replicate number.
Pearson Correlation is the default for the function cor. Type ?cor for more info.
# Extract the rlog matrix from the object
rld_mat_all <- assay(rld_all)
rld_mat_ko_wt <- assay(rld_ko_wt)
rld_mat_uk_wt <- assay(rld_uk_wt)
rld_mat_tissue <- assay(rld_tissue)
# Compute pairwise correlation values
rld_cor_all <- cor(rld_mat_all)
rld_cor_ko_wt <- cor(rld_mat_ko_wt)
rld_cor_uk_wt <- cor(rld_mat_uk_wt)
rld_cor_tissue <- cor(rld_mat_tissue)
# Plot heatmap using the correlation matrix and the metadata object
# To find specific columns for the legend, type colnames(meta) and select the columns.
meta1 <- meta_all[,c(3:6)]
meta2 <- meta_ko_wt[,c(2:3,5)]
meta3 <- meta_uk_wt[,c(2:3,5)]
meta4 <- meta_tissue[,c(2:4)]
#all_plot1 <- pheatmap(rld_cor_all, annotation = meta1, cluster_rows = TRUE, main="Osteocalcin Heatmap (KO vs WT)", fontsize = 6)
all_plot2 <- pheatmap(rld_cor_ko_wt, annotation = meta2, cluster_rows = TRUE, main="Osteocalcin Heatmap (KO vs WT)", fontsize = 6)
all_plot3 <- pheatmap(rld_cor_uk_wt, annotation = meta3, cluster_rows = TRUE, main="Osteocalcin Heatmap (UK vs WT)", fontsize = 6)
all_plot4 <- pheatmap(rld_cor_tissue, annotation = meta4, cluster_rows = TRUE, main="Osteocalcin Heatmap (Bone vs Organ)", fontsize = 6)
The primary comparison is WT vs. KO.
Specify what factors you want to look at.
# '~ treatment' is interpreted as "read counts explained by treatment"
design(dds_all) <- formula(~ library)
design(dds_ko_wt) <- formula(~ genotype)
design(dds_uk_wt) <- formula(~ genotype)
design(dds_tissue) <- formula(~ tissue)
# Differential expression analysis based on the Negative Binomial distribution
# type ?DESeq for more info
dds_all <- DESeq(dds_all)
dds_ko_wt <- DESeq(dds_ko_wt)
dds_uk_wt <- DESeq(dds_uk_wt)
dds_tissue <- DESeq(dds_tissue)
res_all <- results(dds_all)
res_ko_wt <- results(dds_ko_wt)
res_uk_wt <- results(dds_uk_wt)
res_tissue <- results(dds_tissue)
By default DESeq2 uses the Wald test to identify genes that are differentially expressed between two sample classes. Use with 2 groups of comparison. For 3+ groups use LRT.
# Example: contrast <- c("sampletype", "overexpression", "control")
# Set up you contrasts. Base level (control) goes last
# Deciding the base level will determine how to interpret the fold change
# Compare WT vs KO
contrast_ko_wt <- c("genotype", "KO", "WT")
contrast_uk_wt <- c("genotype", "UK", "WT")
contrast_tissue <- c("tissue", "organ", "bone")
results(dds_ko_wt, contrast = contrast_ko_wt)
results(dds_uk_wt, contrast = contrast_uk_wt)
results(dds_tissue, contrast = contrast_tissue)
# Extract results
res_ko_wt <- results(dds_ko_wt, contrast = contrast_ko_wt, alpha = 0.15, lfcThreshold = 0.1) ############## THIS SHOULD BE 0.05 OR LESS ##############
res_uk_wt <- results(dds_uk_wt, contrast = contrast_uk_wt, alpha = 0.05, lfcThreshold = 0.1) ############## THIS SHOULD BE 0.05 OR LESS ##############
res_tissue <- results(dds_tissue, contrast = contrast_tissue, alpha = 0.05, lfcThreshold = 0.1) ############## THIS SHOULD BE 0.05 OR LESS ##############
# The results table that is returned to us is a DESeqResults object
# class(res_tissue)
# Check what is stored in the results
# res_tissue %>% data.frame() %>% Show()
This is important because you don’t want to include zero expression genes in your analysis as it will greatly decrease significance via multiple testing and the FDR/BH correction.
# Filter genes with zero expression in all samples
res_all[which(res_all$baseMean == 0),] %>% data.frame()
res_ko_wt[which(res_ko_wt$baseMean == 0),] %>% data.frame()
res_uk_wt[which(res_uk_wt$baseMean == 0),] %>% data.frame()
res_tissue[which(res_tissue$baseMean == 0),] %>% data.frame()
# Filter genes that have an extreme outlier
# If a sample has a gene with an extreme outlier, the p-value and adjusted p-value will be set to NA.
res_all[which(is.na(res_all$pvalue) & is.na(res_all$padj)),] %>% data.frame()
res_ko_wt[which(is.na(res_ko_wt$pvalue) & is.na(res_ko_wt$padj)),] %>% data.frame()
res_uk_wt[which(is.na(res_uk_wt$pvalue) & is.na(res_uk_wt$padj)),] %>% data.frame()
res_tissue[which(is.na(res_tissue$pvalue) & is.na(res_tissue$padj)),] %>% data.frame()
# Filter genes below the low mean threshold
# Genes with very low counts are not likely to have significant differences
res_all[which(!is.na(res_all$pvalue) & is.na(res_all$padj)),] %>% data.frame()
res_ko_wt[which(!is.na(res_ko_wt$pvalue) & is.na(res_ko_wt$padj)),] %>% data.frame()
res_uk_wt[which(!is.na(res_uk_wt$pvalue) & is.na(res_uk_wt$padj)),] %>% data.frame()
res_tissue[which(!is.na(res_tissue$pvalue) & is.na(res_tissue$padj)),] %>% data.frame()
In order to model counts, you need to know if your RNA-seq data meets the basic assumptions, including dispersion.
This fits a curve to the gene-wise dispersion estimates and plots the expected dispersion value for genes of a given expression strength. Each black dot is a gene with an associated mean expression level and maximum likelihood estimation (MLE) of the dispersion. Read more at https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/04b_DGE_DESeq2_analysis.html
# Get information on each column in results
mcols(res_all, use.names=T)
mcols(res_ko_wt, use.names=T)
mcols(res_uk_wt, use.names=T)
mcols(res_tissue, use.names=T)
# Check the size factors
sizeFactors(dds_all)
sizeFactors(dds_ko_wt)
sizeFactors(dds_uk_wt)
sizeFactors(dds_tissue)
# Total number of raw counts per sample
colSums(counts(dds_all))
colSums(counts(dds_ko_wt))
colSums(counts(dds_uk_wt))
colSums(counts(dds_tissue))
# Look at the total depth after normalization
# Total number of normalized counts per sample
colSums(counts(dds_all, normalized=T))
colSums(counts(dds_ko_wt, normalized=T))
colSums(counts(dds_uk_wt, normalized=T))
colSums(counts(dds_tissue, normalized=T))
# Plot dispersion estimates
# plotDispEsts(dds_all, main="Dispersion Genotype (All)")
# plotDispEsts(dds_ko_wt, main="Dispersion Genotype (WT vs KO)")
# plotDispEsts(dds_uk_wt, main="Dispersion Genotype (WT vs UK)")
plotDispEsts(dds_tissue, main="Dispersion Tissue (Bone vs. Organ)")
Shrinkage will help to improve the accuracy of count models
# Save the unshrunken results to compare
res_ko_wt_unshrunken <- res_ko_wt
res_uk_wt_unshrunken <- res_uk_wt
res_tissue_unshrunken <- res_tissue
resultsNames(dds_ko_wt)
resultsNames(dds_uk_wt)
resultsNames(dds_tissue)
# Apply fold change shrinkage
res_ko_wt <- lfcShrink(dds_ko_wt, contrast=contrast_ko_wt, res=res_ko_wt, type="normal")
res_uk_wt <- lfcShrink(dds_uk_wt, contrast=contrast_uk_wt, res=res_uk_wt, type="normal")
res_tissue <- lfcShrink(dds_tissue, contrast=contrast_tissue, res=res_tissue, type="normal")
# MA plot using unshrunken fold changes
plotMA(res_ko_wt_unshrunken, ylim=c(-6,6))
# plotMA(res_uk_wt_unshrunken, ylim=c(-6,6))
# plotMA(res_tissue_unshrunken, ylim=c(-6,6))
# MA plot using shrunken fold changes
plotMA(res_ko_wt, ylim=c(-6,6))
# plotMA(res_uk_wt, ylim=c(-6,6))
# plotMA(res_tissue, ylim=c(-6,6))
As assumption of DESeq2 is that most genes will remain unchanged. Plotting the distribution of gene ratios between each gene and the average gene can show how true this is. If some samples show different distribution, there may be bias due to some biological or technical factor. Read more on degCheckFactors at https://rdrr.io/bioc/DEGreport/man/degCheckFactors.html.
degCheckFactors(counts(dds_tissue), each=TRUE)
Generate a Results table ith only significant genes
# Summarize results
summary(res_all, alpha = 0.15) ############## THIS SHOULD BE 0.05 OR LESS ##############
summary(res_ko_wt, alpha = 0.15) ############## THIS SHOULD BE 0.05 OR LESS ##############
summary(res_uk_wt, alpha = 0.05) ############## THIS SHOULD BE 0.05 OR LESS ##############
summary(res_tissue, alpha = 0.05) ############## THIS SHOULD BE 0.05 OR LESS ##############
# Set P adjust thresholds
padj.cutoff2 <- 0.05 ############## THIS SHOULD BE 0.05 OR LESS ##############
padj.cutoff3 <- 0.15 ############## THIS SHOULD BE 0.05 OR LESS ##############
# Create a tibble of results
# A tibble is like a dataframe; the tidyverse package requires you convert dataframes to tibbles
# To learn more about tibbles: https://r4ds.had.co.nz/tibbles.html
res_ko_wt_tb <- res_ko_wt %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
res_uk_wt_tb <- res_uk_wt %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
res_tissue_tb <- res_tissue %>% data.frame() %>% rownames_to_column(var="gene") %>% as_tibble()
# Subset the tibble to keep only significant genes
# This means we run less statistical tests and dont have to adjust FDR as much.
# ie. instead of adjusting for 10,000 multiple tests, you can just adjust for the most relevent 2,000 tests/genes.
sigGE_ko_wt <- res_ko_wt_tb %>% dplyr::filter(padj < padj.cutoff3)
sigGE_uk_wt <- res_uk_wt_tb %>% dplyr::filter(padj < padj.cutoff2)
sigGE_tissue <- res_tissue_tb %>% dplyr::filter(padj < padj.cutoff2)
# dim(sigGE_ko_wt) # Check how mny rows/DEGs there are
DESeq2 has its own method of normalization (not TPM or FPKM), called Median of Ratios. This normalization accounts for sequencing depth and RNA composition, but not gene length. For this reason, you should NOT do within sample comparisons because gene lengths can be very different (ie. don’t compare col1a1 to col2a1 in muscle). However, between sample comparisons are fine, because Gene A is the same length between both Group 1 and Group 2. To better understand normalization read: https://hbctraining.github.io/DGE_workshop_salmon_online/lessons/02_DGE_count_normalization.html.
# Add sampletype to the metadata
meta_all <- meta_all %>% rownames_to_column(var="samplename") %>% as_tibble()
meta_ko_wt <- meta_ko_wt %>% rownames_to_column(var="samplename") %>% as_tibble()
meta_uk_wt <- meta_uk_wt %>% rownames_to_column(var="samplename") %>% as_tibble()
meta_tissue <- meta_tissue %>% rownames_to_column(var="samplename") %>% as_tibble()
# First convert normalized_counts to a data frame and transfer the row names to a new column called "gene"
norm_counts_all <- counts(dds_all, normalized=T) %>% data.frame() %>% rownames_to_column(var="gene")
norm_counts_ko_wt <- counts(dds_ko_wt, normalized=T) %>% data.frame() %>% rownames_to_column(var="gene")
norm_counts_uk_wt <- counts(dds_uk_wt, normalized=T) %>% data.frame() %>% rownames_to_column(var="gene")
norm_counts_tissue <- counts(dds_tissue, normalized=T) %>% data.frame() %>% rownames_to_column(var="gene")
# Merge ensembl IDs the normalized counts data frame with a subset of the annotations in the tx2gene data frame
mus38 <- tx2gene %>% dplyr::select(ensgene, symbol) %>% dplyr::distinct()
# This will bring in a column of gene symbols
norm_counts_all <- merge(norm_counts_all, mus38, by.x="gene", by.y="ensgene")
norm_counts_ko_wt <- merge(norm_counts_ko_wt, mus38, by.x="gene", by.y="ensgene")
norm_counts_uk_wt <- merge(norm_counts_uk_wt, mus38, by.x="gene", by.y="ensgene")
norm_counts_tissue <- merge(norm_counts_tissue, mus38, by.x="gene", by.y="ensgene")
# Now create a tibble for the normalized counts
norm_counts_all <- norm_counts_all %>% as_tibble()
norm_counts_ko_wt <- norm_counts_ko_wt %>% as_tibble()
norm_counts_uk_wt <- norm_counts_uk_wt %>% as_tibble()
norm_counts_tissue <- norm_counts_tissue %>% as_tibble()
#colnames(norm_counts_all) #Check column names
# Normalized Counts are: Median of Ratios
norm_counts_all <- norm_counts_all[, c(1,16,2:15)] #re-order the columns
norm_counts_ko_wt <- norm_counts_ko_wt[, c(1,12,2:11)] #re-order the columns
norm_counts_uk_wt <- norm_counts_uk_wt[, c(1,10,2:9)] #re-order the columns
norm_counts_tissue <- norm_counts_tissue[, c(1,16,2:15)] #re-order the columns
Testing hyotheses about tissue=specifc and cell-specific gene expression
# Find the Ensembl ID of a particular gene.
mus38[mus38$symbol == "Col1a1", "ensgene"]
# Copy the Ensembl number and assign it to the gene name
Col1a1 <- "ENSMUSG00000022483"
Bglap <- "ENSMUSG00000074483"
Tdtomato <- "TRANSGENES"
# Save plotcounts to a data frame object for later plotting
d0 <- plotCounts(dds_all, gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d1 <- plotCounts(dds_ko_wt, gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d2 <- plotCounts(dds_uk_wt, gene = Col1a1, intgroup ="genotype", returnData=TRUE)
d3 <- plotCounts(dds_tissue, gene = Col1a1, intgroup ="tissue", returnData=TRUE)
# What is the data output of plotCounts()?
# d0 %>% Show()
# Plot the normalized counts, using the sampletypes (rownames(d) as labels)
c0_plot <- ggplot(d1, aes(x = genotype, y = count, color = genotype)) +
geom_point(position=position_jitter(w = 0.2,h = 0)) +
geom_text_repel(aes(label = rownames(d1))) + theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Col1a1 Counts by Genotype (WT vs KO)")
c1_plot <- ggplot(d3, aes(x = tissue, y = count, color = tissue)) +
geom_point(position=position_jitter(w = 0.2,h = 0)) +
geom_text_repel(aes(label = rownames(d3))) + theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Col1a1 Counts by Tissue (Bone vs. Organ)")
c0_plot
c1_plot
# Note that for mouse, capitalize first letter, then lower case.
# For human genes, all caps.
mus38[mus38$symbol == "Bglap3", "ensgene"]
# Osteoprogenitor, osteoblast and osteocyte markers
Col1a1 <- "ENSMUSG00000001506"
Col2a1 <- "ENSMUSG00000022483"
Dcn <- "ENSMUSG00000019929"
Sp7 <- "ENSMUSG00000060284"
Runx2 <- "ENSMUSG00000039153"
Bglap <- "ENSMUSG00000074483"
Bglap3 <- "ENSMUSG00000074489"
Spp1 <- "ENSMUSG00000029304"
Sparc <- "ENSMUSG00000018593"
Tmem119 <- "ENSMUSG00000054675"
Dmp1 <- "ENSMUSG00000029307"
Fgf23 <- "ENSMUSG00000000182"
Sost <- "ENSMUSG00000001494"
CD200 <- "ENSMUSG00000022661" # ox-2
Itgb1 <- "ENSMUSG00000025809" # cd29
Thy1 <- "ENSMUSG00000032011" # cd90
# Save plotcounts to a data frame object
cplot1a <- plotCounts(dds_all, gene=Col1a1, intgroup="genotype", returnData=TRUE)
cplot2a <- plotCounts(dds_all, gene=Col2a1, intgroup="genotype", returnData=TRUE)
cplot3a <- plotCounts(dds_all, gene=Dcn, intgroup="genotype", returnData=TRUE)
cplot4a <- plotCounts(dds_all, gene=Sp7, intgroup="genotype", returnData=TRUE)
cplot5a <- plotCounts(dds_all, gene=Runx2, intgroup="genotype", returnData=TRUE)
cplot6a <- plotCounts(dds_all, gene=Bglap, intgroup="genotype", returnData=TRUE)
cplot7a <- plotCounts(dds_all, gene=Spp1, intgroup="genotype", returnData=TRUE)
cplot8a <- plotCounts(dds_all, gene=Sparc, intgroup="genotype", returnData=TRUE)
cplot9a <- plotCounts(dds_all, gene=Tmem119, intgroup="genotype", returnData=TRUE)
cplot10a <- plotCounts(dds_all, gene=Dmp1, intgroup="genotype", returnData=TRUE)
cplot11a <- plotCounts(dds_all, gene=Fgf23, intgroup="genotype", returnData=TRUE)
cplot12a <- plotCounts(dds_all, gene=Sost, intgroup="genotype", returnData=TRUE)
cplot13a <- plotCounts(dds_all, gene=Bglap3, intgroup="genotype", returnData=TRUE)
cplot14a <- plotCounts(dds_all, gene=CD200, intgroup="genotype", returnData=TRUE)
cplot15a <- plotCounts(dds_all, gene=Itgb1, intgroup="genotype", returnData=TRUE)
cplot16a <- plotCounts(dds_all, gene=Thy1, intgroup="genotype", returnData=TRUE)
#Set p-adjusted value cut off
padj_cutoff <- 0.15 ############## THIS SHOULD BE 0.05 OR LESS ##############
padj_cutoff2 <- 0.05 ############## THIS SHOULD BE 0.05 OR LESS ##############
sig_res_ko_wt <- dplyr::filter(res_ko_wt_tb, padj < padj_cutoff) %>% arrange(padj)
sig_res_uk_wt <- dplyr::filter(res_uk_wt_tb, padj < padj_cutoff) %>% arrange(padj)
sig_res_tissue <- dplyr::filter(res_tissue_tb, padj < padj_cutoff2) %>% arrange(padj)
#dim(sig_res_ko_wt) # Look at rows to see how many DEGs were found
## Order results by padj values
top20_ko_wt <- sig_res_ko_wt %>% arrange(padj) %>% pull(gene) %>% head(n=20)
top20_uk_wt <- sig_res_uk_wt %>% arrange(padj) %>% pull(gene) %>% head(n=20)
top20_tissue <- sig_res_tissue %>% arrange(padj) %>% pull(gene) %>% head(n=20)
## Normalized counts for top 20 significant genes
top20_sig_ko_wt <- norm_counts_ko_wt %>% dplyr::filter(gene %in% top20_ko_wt)
top20_sig_uk_wt <- norm_counts_uk_wt %>% dplyr::filter(gene %in% top20_uk_wt)
top20_sig_tissue <- norm_counts_tissue %>% dplyr::filter(gene %in% top20_tissue)
# Gathering the columns to have normalized counts to a single column
gathered_top20_sig_ko_wt <- top20_sig_ko_wt %>% gather(colnames(top20_sig_ko_wt)[3:12], key = "sample", value = "normalized_counts")
gathered_top20_sig_uk_wt <- top20_sig_uk_wt %>% gather(colnames(top20_sig_uk_wt)[3:10], key = "sample", value = "normalized_counts")
gathered_top20_tissue <- top20_sig_tissue %>% gather(colnames(top20_sig_tissue)[3:16], key = "sample", value = "normalized_counts")
# Check the column header in the "gathered" data frame
# View(gathered_top20_sig_all)
# Combine metadata with the melted normalized countsinto the same data frame for input to ggplot()
gathered_top20_sig_ko_wt <- inner_join(meta, gathered_top20_sig_ko_wt)
gathered_top20_sig_uk_wt <- inner_join(meta, gathered_top20_sig_uk_wt)
gathered_top20_tissue <- inner_join(meta, gathered_top20_tissue)
bk <- ggplot(gathered_top20_sig_ko_wt) + geom_point(aes(x = symbol, y = normalized_counts,
color = genotype)) + xlab("Genes") + ylab("log10 Normalized Counts") +
ggtitle("Top 20 DEGs Genotype (WT vs KO)") + theme_bw() + scale_y_log10() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5))
bt <- ggplot(gathered_top20_tissue) + geom_point(aes(x = symbol, y = normalized_counts,
color = tissue)) + xlab("Genes") + ylab("log10 Normalized Counts") +
ggtitle("Top 20 DEGs (Tissue)") + theme_bw() + scale_y_log10() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5))
bk
bt
# TRUE values denote padj values < 0.05 and fold change > 1.5 in either direction
res_ko_wt_tb <- res_ko_wt_tb %>% mutate(threshold_OE=padj < 0.15 & abs(log2FoldChange) >= 0.15) ############## THIS SHOULD BE 0.05 OR LESS ##############
res_uk_wt_tb <- res_uk_wt_tb %>% mutate(threshold_OE=padj < 0.05 & abs(log2FoldChange) >= 0.05) ############## THIS SHOULD BE 0.05 OR LESS ##############
res_tissue_tb <- res_tissue_tb %>% mutate(threshold_OE=padj < 0.05 & abs(log2FoldChange) >= 0.05) ############## THIS SHOULD BE 0.05 OR LESS ##############
# Log2FC Volcano plot
ggplot(res_ko_wt_tb) + geom_point(aes(x = log2FoldChange, y = -log10(padj),
colour = threshold_OE)) + ggtitle("Genotype (WT vs KO)") + xlab("log2 fold change") +
ylab("-log10 adjusted p-value") + scale_y_continuous(limits = c(0,50)) +
theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25)))
ggplot(res_tissue_tb) + geom_point(aes(x = log2FoldChange, y = -log10(padj),
colour = threshold_OE)) + ggtitle("Tissue (Bone. vs Organ)") + xlab("log2 fold change") +
ylab("-log10 adjusted p-value") + scale_y_continuous(limits = c(0,50)) +
theme(legend.position = "none", plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25)))
# Add all the gene symbols as a column from the grch38 table using bind_cols()
res_ko_wt_tb <- bind_cols(res_ko_wt_tb, symbol = mus38$symbol[match(res_ko_wt_tb$gene, mus38$ensgene)])
res_uk_wt_tb <- bind_cols(res_uk_wt_tb, symbol = mus38$symbol[match(res_uk_wt_tb$gene, mus38$ensgene)])
res_tissue_tb <- bind_cols(res_tissue_tb, symbol = mus38$symbol[match(res_tissue_tb$gene, mus38$ensgene)])
# Create an empty column to indicate which genes to label
res_ko_wt_tb <- res_ko_wt_tb %>% mutate(genelabels = "")
res_uk_wt_tb <- res_uk_wt_tb %>% mutate(genelabels = "")
res_tissue_tb <- res_tissue_tb %>% mutate(genelabels = "")
# Sort by padj values
res_ko_wt_tb <- res_ko_wt_tb %>% arrange(padj)
res_uk_wt_tb <- res_uk_wt_tb %>% arrange(padj)
res_tissue_tb <- res_tissue_tb %>% arrange(padj)
# Populate the genelabels column with contents of the gene symbols column for the first 20 rows,
# i.e. the top 20 most significantly expressed genes
res_ko_wt_tb$genelabels[1:20] <- as.character(res_ko_wt_tb$symbol[1:20])
res_uk_wt_tb$genelabels[1:20] <- as.character(res_uk_wt_tb$symbol[1:20])
res_tissue_tb$genelabels[1:20] <- as.character(res_tissue_tb$symbol[1:20])
log2fc_wt_ko_plot <- ggplot(res_ko_wt_tb, aes(x = log2FoldChange, y = -log10(padj))) +
xlab("log2 fold change") + geom_point(aes(colour = threshold_OE)) +
ylim(0,25) + ylab("-log10 adjusted p-value") +
geom_text_repel(aes(label = genelabels)) +
ggtitle("Log2FC Genotype (WT vs KO)") +
theme(legend.position = "none",
plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25)))
log2fc_tissue_plot <- ggplot(res_tissue_tb, aes(x = log2FoldChange, y = -log10(padj))) +
xlab("log2 fold change") + geom_point(aes(colour = threshold_OE)) +
ylim(0,300) + ylab("-log10 adjusted p-value") +
geom_text_repel(aes(label = genelabels)) +
ggtitle("Log2FC Tissue (Bone vs. Organ)") +
theme(legend.position = "none",
plot.title = element_text(size = rel(1.5), hjust = 0.5),
axis.title = element_text(size = rel(1.25)))
log2fc_wt_ko_plot
log2fc_tissue_plot